<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <!-- Document title (browser tab, bookmarks, search results); uses the paper title shown in the page header. -->
    <title>DriveVLM: The Convergence of Autonomous Driving and Large Vision-Language Models</title>
    <style>
        /* Page-wide typeface. */
        body {
            font-family: Arial, sans-serif;
        }

        /* Centered column used by the title/author header and by the video section. */
        .container_title {
            margin-left: 200px;
            margin-right: 200px;
            display: flex;
            justify-content: center;
            align-items: center;
            font-size: 22px;
            flex-direction: column;
        }

        /* Fixed-width column with justified text, used by the article sections. */
        .container {
            width: 1000px;
            /*margin-left: 200px;*/
            /*margin-right: 200px;*/
            line-height: 1.6;
            display: flex;
            justify-content: center;
            /* flex-start instead of start: same alignment in this column flex container,
               but understood by more browsers. */
            align-items: flex-start;
            font-size: 18px;
            flex-direction: column;
            text-align: justify;
        }

        /* Figures span the same 1000px as the text column. */
        .image-container img {
            width: 1000px; /* fixed image width */
            height: auto; /* keep the image's aspect ratio */
        }

        /* Outermost wrapper: stacks the sections vertically and centers them horizontally. */
        .all {
            display: flex;
            flex-direction: column;
            justify-content: center;
            align-items: center;
        }

        /* Thin light-grey separator between sections. */
        hr {
            width: 100%;
            border: none; /* remove the default border */
            height: 1px; /* line thickness */
            background-color: #EEEEEE; /* line color */
            margin: 50px 0; /* spacing above and below */
        }

        /* Box holding the citation; position: relative makes it the anchor for .copy-button. */
        .container_copy {
            position: relative;
            padding: 20px;
            background-color: #f4f4f4;
            width: 1000px;
            border: 1px solid #ccc;
        }

        /* Copy button pinned to the top-right corner of .container_copy. */
        .copy-button {
            position: absolute;
            top: 5px;
            right: 5px;
            cursor: pointer;
        }
    </style>
</head>

<body>

<div class="all">
    <section>
        <!-- Page header: paper title, author list, affiliations, venue and a link to the paper. -->
        <div class="container_title" style="text-align: center; margin-top: 50px">
            <!-- <h1>InterSim</h1> -->
            <!-- <p>* Denotes equal contribution</p> -->
            <h2>DriveVLM: The Convergence of Autonomous Driving and Large Vision-Language Models</h2>
            <!-- <h3>A Simulator for <strong>Interactive</strong> Behaviour Simulations</h3> -->
            <p style="font-size: 20px">
                Xiaoyu Tian<sup>1</sup>*, Junru Gu<sup>1</sup>*, Bailin Li<sup>2</sup>*,
                Yicheng Liu<sup>1</sup>*, Yang Wang<sup>2</sup>, Zhiyong Zhao<sup>2</sup>,
                Kun Zhan<sup>2</sup>, Peng Jia<sup>2</sup>, Xianpeng Lang<sup>2</sup>,
                Hang Zhao<sup>1†</sup>
            </p>
            <!--                        <h5><sup>1</sup> Shanghai Qi Zhi Institute, <sup>2</sup> Fudan University, <sup>3</sup> Pegasus Tech,-->
            <!-- Affiliations. The opening h5 above is commented out, so no closing h5 belongs here. -->
            <div style="font-size: 20px">
                <sup>1</sup> IIIS, Tsinghua University, <sup>2</sup> Li Auto
            </div>

            <div style="font-size: 14px; margin-top: 20px">
                <p>
                    * Equal contribution. Listing order is random.
                </p>
                <p style="margin-top: -5px">
                    † Corresponding author.
                </p>
            </div>

            <div style="font-size: 16px; margin-top: 5px">
                <p>
                    Conference on Robot Learning (CoRL) 2024
                </p>
            </div>

            <div style="margin-top: 10px; margin-bottom: 50px">
                <!-- The icon is the link's only content, so its alt text is the link's accessible name. -->
                <a href="https://arxiv.org/abs/2402.12289" target="_blank" rel="noopener">
                    <img id="paper-icon" src="images/paper_icon.png" style="width: 30px" alt="Paper on arXiv"
                         title="Show Paper">
                </a>
                <!--                    <li><a href="DriveVLM.pdf" class="icon style2 fa-file-pdf" download="DriveVLM.pdf"-->
                <!--                           target="_blank"><span-->
                <!--                            class="label">Arxiv</span></a>-->
                <!--                    </li>-->
            </div>
        </div>
    </section>

    <section>
        <!-- Embedded demo video, followed by links to watch it on YouTube or Bilibili. -->
        <div class="container_title">
            <div>
                <iframe width="900" height="506" src="https://www.youtube.com/embed/mt-SdHTTZzA?si=ZnbL5B_FNtdumFlE"
                        title="YouTube video player" frameborder="0"
                        allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
                        referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
            </div>
            <!--        <div>-->
            <!--            <a href="https://www.bilibili.com/video/BV1h1421C7KQ">Watch on Youtube</a>-->
            <!--        </div>-->
            <div>
                <div style="font-size: 16px; margin-top: 10px">
                    <!--                   Watch on <a href="https://www.youtube.com/watch?v=mt-SdHTTZzA" target="_blank">YouTube</a>-->
                    <!--                    or <a href="https://www.bilibili.com/video/BV1h1421C7KQ" target="_blank">Bilibili</a>-->
                    <!-- Each logo is the only content of its link, so the alt text names the link. -->
                    <div style="width: 220px; display: flex; flex-direction: row; align-items: center; justify-content: space-between; ">
                        <div>
                            Watch on
                        </div>
                        <a href="https://www.youtube.com/watch?v=mt-SdHTTZzA" target="_blank" rel="noopener">
                            <img src="images/icons/youtube.svg" style="width: 70px" alt="YouTube">
                        </a>
                        <div>
                            or
                        </div>
                        <a href="https://www.bilibili.com/video/BV1h1421C7KQ" target="_blank" rel="noopener">
                            <img src="images/icons/bilibili.svg" style="width: 50px" alt="Bilibili">
                        </a>
                    </div>
                </div>
            </div>
        </div>
    </section>

    <hr>

    <section id="third">
        <!-- Paper abstract, rendered as a single justified paragraph. -->
        <div class="container">
            <h2>Abstract</h2>
            <p>
                A primary hurdle of autonomous driving in urban environments is understanding complex and
                long-tail scenarios, such as challenging road conditions and delicate human behaviors.
                We introduce DriveVLM, an autonomous driving system leveraging Vision-Language Models (VLMs)
                for enhanced scene understanding and planning capabilities.
                DriveVLM integrates a unique combination of reasoning modules for scene description, scene
                analysis, and hierarchical planning.
                Furthermore, recognizing the limitations of VLMs in spatial reasoning and heavy computational
                requirements, we propose DriveVLM-Dual, a hybrid system that synergizes the strengths of
                DriveVLM with the traditional autonomous driving pipeline.
                Experiments on both the nuScenes dataset and our SUP-AD dataset demonstrate the efficacy of
                DriveVLM and DriveVLM-Dual in handling complex and unpredictable driving conditions.
                Finally, we deploy the DriveVLM-Dual on a production vehicle, verifying it is effective in
                real-world autonomous driving environments.
            </p>
        </div>
    </section>

    <hr>

    <section>
        <!-- Method overview: short description followed by the pipeline figure. -->
        <div class="container">
            <h2>DriveVLM</h2>
            <p>DriveVLM accepts sequences of images as input and,
                through a reasoning-based Chain-of-Thought (CoT) mechanism, outputs hierarchical planning
                predictions.
                DriveVLM can optionally incorporate traditional 3D perception and trajectory planning modules to
                achieve
                spatial reasoning capability and real-time trajectory planning.
            </p>
            <!--        <div style="width: 500px">-->
            <!--            <img src="images/pipeline.png" alt="Placeholder Image">-->
            <!--        </div>-->

            <!-- The figure carries content, so it needs a text alternative rather than an empty alt. -->
            <div class="image-container">
                <img src="images/pipeline.png" alt="Overview of the DriveVLM pipeline">
            </div>

            <!--        <span class="image main"></span>-->
            <!--        <img src="images/pipeline.png" alt="">-->
        </div>
    </section>

    <hr>

    <section>
        <!-- Dataset construction: two figures, each introduced by the paragraph above it. -->
        <div class="container">
            <h2>Data Annotation</h2>

            <p>Data mining and annotation pipeline for building a scene
                understanding dataset:
            </p>
            <!-- Alt text mirrors the introducing paragraph so the figure is not skipped by screen readers. -->
            <div class="image-container"><img src="images/annotation.png" alt="Data mining and annotation pipeline"></div>

            <p>The figure below illustrates a sample scenario with detailed
                annotations. We employ a group of annotators to
                perform the scene annotation, including scene description,
                scene analysis, and planning, except for waypoints, which
                can be auto-labeled from the vehicle’s IMU recordings.
            </p>
            <div class="image-container"><img src="images/dataset.png" alt="Sample scenario with detailed annotations"></div>
        </div>
    </section>


    <!-- Relation prediction demos -->

    <!--    <section class="wrapper style1 align-left">-->
    <!--        <div class="inner">-->
    <!--            <h2>Qualitative analysis</h2>-->
    <!--            &lt;!&ndash;            <p>&ndash;&gt;-->
    <!--            &lt;!&ndash;                DriveVLM accurately predicts the current scene conditions and incorporates well-considered planning&ndash;&gt;-->
    <!--            &lt;!&ndash;                decisions regarding the cyclist approaching us.&ndash;&gt;-->
    <!--            &lt;!&ndash;                In the figure below, DriveVLM effectively comprehends the gesture of the traffic police ahead,&ndash;&gt;-->
    <!--            &lt;!&ndash;                signaling the ego vehicle to proceed, and also considers the person riding a tricycle on the right side,&ndash;&gt;-->
    <!--            &lt;!&ndash;                thereby making sensible driving decisions. These qualitative results demonstrate our model's exceptional&ndash;&gt;-->
    <!--            &lt;!&ndash;                ability to understand complex scenarios and make driving plans.&ndash;&gt;-->
    <!--            &lt;!&ndash;            </p>&ndash;&gt;-->
    <!--            &lt;!&ndash;            <div style="display: flex">&ndash;&gt;-->
    <!--            &lt;!&ndash;                <span class="image main"><img src="images/qualitative_1.png" alt=""></span>&ndash;&gt;-->
    <!--            &lt;!&ndash;                <span class="image main"><img src="images/qualitative_2.png" alt=""></span>&ndash;&gt;-->
    <!--            &lt;!&ndash;            </div>&ndash;&gt;-->
    <!--            <p>-->
    <!--                In the figure below, the traffic police signaling to proceed with hand gestures has been accurately-->
    <!--                captured by DriveVLM.-->
    <!--            </p>-->
    <!--            <div>-->
    <!--                <span class="image main"><img src="images/vis_1.png" alt=""></span>-->
    <!--            </div>-->
    <!--            <p>-->
    <!--                In the figure below, DriveVLM precisely detect the fallen tree and its position, subsequently planning-->
    <!--                an appropriate detour trajectory.-->
    <!--            </p>-->
    <!--            <div>-->
    <!--                <span class="image main"><img src="images/vis_2.png" alt=""></span>-->
    <!--            </div>-->
    <!--        </div>-->
    <!--    </section>-->


    <!--    <section class="wrapper style1 align-left">-->
    <!--        <div class="inner">-->
    <!--            <h2>Contact Us</h2>-->

    <!--            <div class="index align-left">-->

    <!--                <form action="https://submit-form.com/3TNtd6SQ">-->
    <!--                    <div class="fields">-->
    <!--                        <div class="field half">-->
    <!--                            <label for="name">Name</label>-->
    <!--                            <input type="text" id="name" name="name" value=""/>-->
    <!--                        </div>-->
    <!--                        <div class="field half">-->
    <!--                            <label for="email">Email</label>-->
    <!--                            <input type="email" name="email" id="email" value="">-->
    <!--                        </div>-->
    <!--                        <div class="field">-->
    <!--                            <label for="message">Message</label>-->
    <!--                            <textarea name="message" id="message" rows="5"></textarea>-->
    <!--                        </div>-->
    <!--                        <div class="field align-center">-->
    <!--                            <ul class="actions">-->
    <!--                                <li><input type="submit" name="submit" id="submit" value="Send This Message"></li>-->
    <!--                            </ul>-->
    <!--                        </div>-->

    <!--                    </div>-->

    <!--                </form>-->
    <!--            </div>-->
    <!--        </div>-->
    <!--    </section>-->

    <hr>

    <section>
        <!-- BibTeX entry with a "Copy" button. The page script finds the two elements through
             the "copy-button" class and the "text-to-copy" id, so both hooks must stay as they are. -->
        <div class="container">
            <h2>Citation</h2>
            <!--            <blockquote style="text-align:left; background-color:#EEEEEE">-->
            <!--            </blockquote>-->

            <div class="container_copy">
                <!-- type="button": this is an in-page action, not a form submission. -->
                <button class="copy-button" type="button">Copy</button>
                <p id="text-to-copy">
                    @article{DriveVLM, <br>
                    &nbsp; &nbsp; title={DriveVLM: The Convergence of Autonomous Driving and Large Vision-Language
                    Models},<br>
                    &nbsp; &nbsp; author={Xiaoyu Tian and Junru Gu and Bailin Li and Yicheng Liu and Zhiyong Zhao and
                    Yang Wang and Kun Zhan and Peng Jia and Xianpeng Lang and Hang Zhao},<br>
                    &nbsp; &nbsp; journal={arXiv preprint arXiv:2402.12289},<br>
                    &nbsp; &nbsp; year={2024}<br>
                    }
                </p>
            </div>
        </div>
    </section>

    <div style="margin-bottom: 100px">

    </div>
</div>

<script>
  // Copy-to-clipboard for the citation: clicking the ".copy-button" element copies the text of
  // the "#text-to-copy" element. Success or failure is reported on the console only.
  (function () {
    var copyButton = document.querySelector('.copy-button');
    var text = document.getElementById('text-to-copy');

    // Nothing to wire up if either element is absent; avoids a TypeError on a missing node.
    if (!copyButton || !text) {
      return;
    }

    // Fallback path: select the node and run the deprecated execCommand('copy').
    // Kept because navigator.clipboard is unavailable outside secure contexts and in old browsers.
    function copyWithSelection() {
      var selection = window.getSelection();
      var range = document.createRange();
      selection.removeAllRanges(); // clear any existing selection on the page
      range.selectNode(text);
      selection.addRange(range); // select the citation text
      try {
        var successful = document.execCommand('copy'); // perform the copy
        var msg = successful ? '成功复制!' : '复制失败';
        console.log(msg);
      } catch (err) {
        console.log('复制失败', err);
      }
      selection.removeAllRanges(); // clear the selection again
    }

    copyButton.addEventListener('click', function () {
      if (navigator.clipboard && navigator.clipboard.writeText) {
        // innerText renders <br> as newlines; the &nbsp; indentation is turned into plain
        // spaces so the pasted BibTeX contains no non-breaking spaces.
        var plain = text.innerText.replace(/\u00a0/g, ' ');
        navigator.clipboard.writeText(plain).then(function () {
          console.log('成功复制!');
        }, function () {
          // Permission denied or similar: fall back to the selection-based copy.
          copyWithSelection();
        });
        return;
      }
      copyWithSelection();
    });
  })();
</script>

</body>

</html>